Load in data

The dataset to be loaded in is entitled ‘cleaned_df_2504fix.csv’. Set working directory to source file location. This is the full dataset for INDICATIVE items only.

# Load the tidyverse for data import (readr::read_csv) and manipulation.
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.1     ✓ dplyr   1.0.0
## ✓ tidyr   1.1.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## Warning: package 'ggplot2' was built under R version 3.6.2
## Warning: package 'tibble' was built under R version 3.6.2
## Warning: package 'tidyr' was built under R version 3.6.2
## Warning: package 'purrr' was built under R version 3.6.2
## Warning: package 'dplyr' was built under R version 3.6.2
## ── Conflicts ─────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Read the cleaned dataset of indicative items.
# NOTE(review): the narrative above this chunk refers to
# 'cleaned_df_complete.csv', but the file actually read is
# 'cleaned_df_2504fix.csv' — confirm this is the intended (fixed) version.
cleaned_df <- read_csv("cleaned_df_2504fix.csv")
## Parsed with column specification:
## cols(
##   response = col_double(),
##   Tensec = col_character(),
##   year_scaled = col_double(),
##   year = col_double(),
##   GENDER = col_character(),
##   RANKNR = col_character(),
##   ORIGINREGION = col_character(),
##   AGE_scaled = col_double(),
##   NAME = col_character(),
##   LETTER = col_character()
## )

Levels and contrasts

We start with the cleaned indicative dataset. First ensure that factors and contrasts are set properly. Reference level for GENDER is MALE and reference level for ORIGINREGION is SW (including Reykjavík).

# Convert social rank (RANKNR) to a factor and inspect its levels.
# Default (alphabetical) reference level is "OFFICIALS/LETTERED".
cleaned_df$RANKNR <- factor(cleaned_df$RANKNR)
rank_levels <- levels(cleaned_df$RANKNR)
print(rank_levels)
## [1] "OFFICIALS/LETTERED" "OTHER PROFESSIONS"  "PEASANTS/LABOURERS"
# Convert author name (NAME) to a factor — this is the grouping variable for
# the by-author random effects below — and list the authors in the data.
cleaned_df$NAME <- as.factor(cleaned_df$NAME)
levels(cleaned_df$NAME)
##   [1] "Aðalbjartur Bjarnason (Albert Arnason)"
##   [2] "Albert Samúelsson"                     
##   [3] "Aldís Laxdal"                          
##   [4] "Andrés Fjeldsted"                      
##   [5] "Andrés Kjerúlf"                        
##   [6] "Andría María Andrésdóttir"             
##   [7] "Árni Helgason"                         
##   [8] "Árni Jóhannsson"                       
##   [9] "Arnór Björnsson"                       
##  [10] "Ásgeir Tryggvi Friðgeirsson"           
##  [11] "Baldvin Einarsson"                     
##  [12] "Benedikt Arason"                       
##  [13] "Benedikt Hálfdanarson"                 
##  [14] "Bjarni Brynjólfsson"                   
##  [15] "Bjarni Thorarensen"                    
##  [16] "Björg Magnúsdóttir"                    
##  [17] "Davíð Valdimarsson"                    
##  [18] "Dómhildur Briem"                       
##  [19] "Einar Andrésson"                       
##  [20] "Einar Ásmundsson"                      
##  [21] "Einar Kristjánsson"                    
##  [22] "Eiríkur Eiríksson"                     
##  [23] "Eiríkur Jóhannsson"                    
##  [24] "Erlendur Ólafsson"                     
##  [25] "Eyjólfur Nikulásson"                   
##  [26] "Finnbogi Kristófersson"                
##  [27] "Friðgeir Olgeirsson"                   
##  [28] "Geir Gunnarsson"                       
##  [29] "Geir Vídalín biskup"                   
##  [30] "Gísli Benjamínsson"                    
##  [31] "Gísli Hjálmarsson"                     
##  [32] "Gísli Jónsson"                         
##  [33] "Guðjón Vigfússon"                      
##  [34] "Guðmundur Magnússon"                   
##  [35] "Guðný Jónsdóttir"                      
##  [36] "Guðríður Magnúsdóttir"                 
##  [37] "Guðrún Skúladóttir"                    
##  [38] "Gunnlaugur Einarsson"                  
##  [39] "Gunnlaugur Oddsson"                    
##  [40] "Gunnsteinn Eyjólfsson"                 
##  [41] "Halldór Pétursson"                     
##  [42] "Hallgrímur Gíslason"                   
##  [43] "Hannes Finsen"                         
##  [44] "Helga Steingrímsdóttir"                
##  [45] "Helgi Pálsson"                         
##  [46] "Hildur Jónsdóttir Johnsen"             
##  [47] "Högni Einarsson"                       
##  [48] "Ingibjörg E. Gunnlaugsdóttir"          
##  [49] "Ingibjörg Jónsdóttir"                  
##  [50] "Jakobína Jónsdóttir"                   
##  [51] "Jóhann Magnússon"                      
##  [52] "Jóhannes Guðmundsson"                  
##  [53] "Jóhannes Halldórsson"                  
##  [54] "Jón Chr. Stephánsson"                  
##  [55] "Jón Gunnsteinsson"                     
##  [56] "Jón Ívarsson"                          
##  [57] "Jón Jónsson"                           
##  [58] "Jónatan Daníelsson"                    
##  [59] "Jónatan Þorláksson"                    
##  [60] "Klemens Björnsson"                     
##  [61] "Klemens Jónsson"                       
##  [62] "Kristín Eiríksdóttir"                  
##  [63] "Kristján J. Kröyer"                    
##  [64] "Lárus Bjarnason"                       
##  [65] "Loftur Jónasson"                       
##  [66] "Malene Jensdóttir"                     
##  [67] "Margrét Daníelsdóttir"                 
##  [68] "Margrét Finnsdóttir"                   
##  [69] "Marteinn Jónsson"                      
##  [70] "Ólafur Ólafsson"                       
##  [71] "Ólafur Sigurðsson"                     
##  [72] "Páll Jóhannsson"                       
##  [73] "Ragnheiður Daníelsdóttir"              
##  [74] "Ragnheiður Finnsdóttir"                
##  [75] "Ragnheiður Þórarinsdóttir"             
##  [76] "Ragnhildur Magnúsdóttir"               
##  [77] "Sæmundur Friðriksson"                  
##  [78] "Samson Björnsson (ath)"                
##  [79] "Sigfús Jónasson Bergmann"              
##  [80] "Sigríður Guðmundsdóttir"               
##  [81] "Sigríður Hannesdóttir"                 
##  [82] "Sigríður Örum"                         
##  [83] "Sigríður Pálsdóttir"                   
##  [84] "Sigurbjörg Einarsdóttir"               
##  [85] "Sigurður Benediktsson"                 
##  [86] "Sigurður Eiríksson"                    
##  [87] "Sigurður Jónsson"                      
##  [88] "Sigurður Jónsson 1814"                 
##  [89] "Sigurður Jósúa Björnsson"              
##  [90] "Sigurður Þórðarson"                    
##  [91] "Snorri Jónsson"                        
##  [92] "Soffía Daníelsdóttir"                  
##  [93] "Stefán Kristinsson"                    
##  [94] "Stefanía Salómonsen"                   
##  [95] "Stefanía Siggeirsdóttir"               
##  [96] "Steinn D. Þórðarson"                   
##  [97] "Sveinbjörn Jónsson"                    
##  [98] "Torfi Eggerz"                          
##  [99] "Valbjörg Þorsteinsdóttir"              
## [100] "Valgerður Björnsdóttir"                
## [101] "Valgerður Jónsdóttir"                  
## [102] "Vigfús Sigfússon"                      
## [103] "Vilhjálmur Oddsen"                     
## [104] "Þórarinn Stefánsson"                   
## [105] "Þorbjörg Jónsdóttir"                   
## [106] "Þórður Sigurðsson"                     
## [107] "Þorgeir Guðmundsson"                   
## [108] "Þorgerður Runólfsdóttir"               
## [109] "Þorkell Teitsson"                      
## [110] "Þorsteinn Helgason"                    
## [111] "Þórunn Hannesdóttir"                   
## [112] "Þórunn Pálsdóttir"                     
## [113] "Þuríður Hallgrímsdóttir"
# Convert GENDER and ORIGINREGION to factors and inspect their levels.
cleaned_df$GENDER <- as.factor(cleaned_df$GENDER)
levels(cleaned_df$GENDER)
## [1] "FEMALE" "MALE"
cleaned_df$ORIGINREGION <- as.factor(cleaned_df$ORIGINREGION)
levels(cleaned_df$ORIGINREGION)
## [1] "E"       "N"       "NE"      "NW"      "S"       "SW"      "UNKNOWN"
## [8] "W"       "Wf."
# Set reference levels for the treatment contrasts: MALE for GENDER and
# SW (including Reykjavík) for ORIGINREGION, as stated in the narrative.
cleaned_df$GENDER <- relevel(cleaned_df$GENDER, ref = "MALE")
cleaned_df$ORIGINREGION <- relevel(cleaned_df$ORIGINREGION, ref = "SW")

Models

Fit maximal model with maximal random effects including slope per individual. Make sure to load lme4 first. Second random effect is a nested intercept for letter within individual author.

# Load lme4 for generalized linear mixed-effects models (glmer).
library(lme4)
## Warning: package 'lme4' was built under R version 3.6.2
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
# m1: maximal model. Binomial (logistic) mixed model with:
#   fixed effects  — Tensec x year_scaled and GENDER x RANKNR interactions,
#                    plus ORIGINREGION and AGE_scaled main effects;
#   random effects — by-author (NAME) intercept and AGE_scaled slope, and a
#                    nested intercept for letter within author (NAME:LETTER).
# bobyqa optimizer chosen to aid convergence.
m1 <- glmer(response ~ Tensec * year_scaled + GENDER * RANKNR + ORIGINREGION + AGE_scaled + (AGE_scaled|NAME) + (1|NAME:LETTER),
            family = binomial(link = "logit"), 
            data = cleaned_df ,
            control = glmerControl(optimizer = "bobyqa"))

summary(m1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: response ~ Tensec * year_scaled + GENDER * RANKNR + ORIGINREGION +  
##     AGE_scaled + (AGE_scaled | NAME) + (1 | NAME:LETTER)
##    Data: cleaned_df
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##   5838.1   5995.5  -2897.0   5794.1     9446 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.2336 -0.3199 -0.2255 -0.1569  8.8578 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev. Corr
##  NAME:LETTER (Intercept) 0.2567   0.5066       
##  NAME        (Intercept) 0.6362   0.7977       
##              AGE_scaled  0.1586   0.3983   0.33
## Number of obs: 9468, groups:  NAME:LETTER, 1570; NAME, 113
## 
## Fixed effects:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                           -0.78266    0.61234  -1.278  0.20120    
## TensecPresent                         -1.44570    0.10083 -14.338  < 2e-16 ***
## year_scaled                            1.13597    0.17396   6.530 6.57e-11 ***
## GENDERFEMALE                           0.97028    0.41759   2.324  0.02015 *  
## RANKNROTHER PROFESSIONS                1.66276    0.57010   2.917  0.00354 ** 
## RANKNRPEASANTS/LABOURERS               1.94213    0.47093   4.124 3.72e-05 ***
## ORIGINREGIONE                         -0.15597    0.56051  -0.278  0.78080    
## ORIGINREGIONN                          1.67524    1.51684   1.104  0.26941    
## ORIGINREGIONNE                        -0.78161    0.59082  -1.323  0.18586    
## ORIGINREGIONNW                        -0.70688    0.64411  -1.097  0.27245    
## ORIGINREGIONS                         -1.86614    0.78601  -2.374  0.01759 *  
## ORIGINREGIONUNKNOWN                   -1.18512    0.59332  -1.997  0.04578 *  
## ORIGINREGIONW                         -1.08927    0.63230  -1.723  0.08494 .  
## ORIGINREGIONWf.                       -1.00422    0.94436  -1.063  0.28760    
## AGE_scaled                            -0.23893    0.09969  -2.397  0.01655 *  
## TensecPresent:year_scaled             -0.94547    0.11177  -8.459  < 2e-16 ***
## GENDERFEMALE:RANKNROTHER PROFESSIONS  -0.14980    0.85042  -0.176  0.86018    
## GENDERFEMALE:RANKNRPEASANTS/LABOURERS -1.84489    0.60745  -3.037  0.00239 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation matrix not shown by default, as p = 18 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it

Model converges; we compare it to a simpler model, first simplifying the random effects by removing the by-author term (AGE_scaled | NAME), i.e. both the per-author intercept and the per-author AGE_scaled slope.

# m2: simplified random effects — only the per-letter intercept remains.
# NOTE(review): this removes the entire by-author term (AGE_scaled | NAME),
# i.e. the per-author intercept AND slope (3 parameters: two variances plus
# their correlation), not just the slope.
m2 <- glmer(response ~ Tensec * year_scaled + GENDER * RANKNR + ORIGINREGION + AGE_scaled + (1 |NAME:LETTER),
            family = binomial(link = "logit"), 
            data = cleaned_df ,
            control = glmerControl(optimizer = "bobyqa"))

summary(m2)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: response ~ Tensec * year_scaled + GENDER * RANKNR + ORIGINREGION +  
##     AGE_scaled + (1 | NAME:LETTER)
##    Data: cleaned_df
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##   5996.0   6131.9  -2979.0   5958.0     9449 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.3560 -0.3184 -0.2086 -0.1602  6.2433 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev.
##  NAME:LETTER (Intercept) 0.8334   0.9129  
## Number of obs: 9468, groups:  NAME:LETTER, 1570
## 
## Fixed effects:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                           -1.17272    0.32876  -3.567 0.000361 ***
## TensecPresent                         -1.47162    0.10537 -13.966  < 2e-16 ***
## year_scaled                            1.25065    0.12615   9.914  < 2e-16 ***
## GENDERFEMALE                           0.82782    0.20750   3.990 6.62e-05 ***
## RANKNROTHER PROFESSIONS                1.80996    0.31223   5.797 6.76e-09 ***
## RANKNRPEASANTS/LABOURERS               1.67775    0.23898   7.020 2.21e-12 ***
## ORIGINREGIONE                          0.42056    0.28212   1.491 0.136030    
## ORIGINREGIONN                          2.36837    1.22852   1.928 0.053877 .  
## ORIGINREGIONNE                        -0.43630    0.27788  -1.570 0.116389    
## ORIGINREGIONNW                        -0.32923    0.36262  -0.908 0.363925    
## ORIGINREGIONS                         -0.62583    0.32806  -1.908 0.056433 .  
## ORIGINREGIONUNKNOWN                   -0.53383    0.31754  -1.681 0.092735 .  
## ORIGINREGIONW                         -1.33024    0.29012  -4.585 4.54e-06 ***
## ORIGINREGIONWf.                       -0.09370    0.34474  -0.272 0.785765    
## AGE_scaled                            -0.29322    0.05106  -5.743 9.33e-09 ***
## TensecPresent:year_scaled             -1.06922    0.11638  -9.187  < 2e-16 ***
## GENDERFEMALE:RANKNROTHER PROFESSIONS  -0.19274    0.54909  -0.351 0.725571    
## GENDERFEMALE:RANKNRPEASANTS/LABOURERS -1.94939    0.38474  -5.067 4.05e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation matrix not shown by default, as p = 18 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it
# Likelihood-ratio test of m2 (reduced random effects) against m1 (maximal);
# 3 df: by-author intercept variance, slope variance, and their correlation.
anova(m1,m2)
## Data: cleaned_df
## Models:
## m2: response ~ Tensec * year_scaled + GENDER * RANKNR + ORIGINREGION + 
## m2:     AGE_scaled + (1 | NAME:LETTER)
## m1: response ~ Tensec * year_scaled + GENDER * RANKNR + ORIGINREGION + 
## m1:     AGE_scaled + (AGE_scaled | NAME) + (1 | NAME:LETTER)
##    npar    AIC    BIC logLik deviance  Chisq Df Pr(>Chisq)    
## m2   19 5996.0 6131.9  -2979   5958.0                         
## m1   22 5838.1 5995.5  -2897   5794.1 163.89  3  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

More complex model wins out. We keep the complex random effects. What about a model with only a random effect for author and not for letter?

# m3: random effects for author only, without the letter-level intercept.
# NOTE(review): (AGE_scaled | NAME) already includes a by-NAME random
# intercept, so the additional (1 | NAME) term is redundant — the summary
# below shows two NAME variance components (NAME and NAME.1). The intended
# formula was presumably just (AGE_scaled | NAME); verify before reuse.
m3 <- glmer(response ~ Tensec * year_scaled + GENDER * RANKNR + ORIGINREGION + AGE_scaled + (AGE_scaled|NAME) + (1 |NAME),
            family = binomial(link = "logit"), 
            data = cleaned_df ,
            control = glmerControl(optimizer = "bobyqa"))

summary(m3)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: response ~ Tensec * year_scaled + GENDER * RANKNR + ORIGINREGION +  
##     AGE_scaled + (AGE_scaled | NAME) + (1 | NAME)
##    Data: cleaned_df
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##   5853.3   6010.7  -2904.7   5809.3     9446 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.7416 -0.3246 -0.2317 -0.1541  9.9442 
## 
## Random effects:
##  Groups Name        Variance Std.Dev. Corr
##  NAME   (Intercept) 0.4179   0.6465       
##         AGE_scaled  0.1679   0.4097   0.42
##  NAME.1 (Intercept) 0.2544   0.5044       
## Number of obs: 9468, groups:  NAME, 113
## 
## Fixed effects:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                           -0.69254    0.60232  -1.150  0.25023    
## TensecPresent                         -1.41190    0.09941 -14.203  < 2e-16 ***
## year_scaled                            1.09648    0.17084   6.418 1.38e-10 ***
## GENDERFEMALE                           0.90828    0.41382   2.195  0.02817 *  
## RANKNROTHER PROFESSIONS                1.58627    0.55717   2.847  0.00441 ** 
## RANKNRPEASANTS/LABOURERS               1.86269    0.46024   4.047 5.18e-05 ***
## ORIGINREGIONE                         -0.16434    0.55768  -0.295  0.76823    
## ORIGINREGIONN                          1.71322    1.46513   1.169  0.24227    
## ORIGINREGIONNE                        -0.79377    0.57341  -1.384  0.16626    
## ORIGINREGIONNW                        -0.73295    0.62774  -1.168  0.24296    
## ORIGINREGIONS                         -1.91123    0.76753  -2.490  0.01277 *  
## ORIGINREGIONUNKNOWN                   -1.20226    0.57753  -2.082  0.03737 *  
## ORIGINREGIONW                         -1.06417    0.62228  -1.710  0.08724 .  
## ORIGINREGIONWf.                       -1.06554    0.93768  -1.136  0.25581    
## AGE_scaled                            -0.23191    0.09764  -2.375  0.01754 *  
## TensecPresent:year_scaled             -0.90527    0.10960  -8.260  < 2e-16 ***
## GENDERFEMALE:RANKNROTHER PROFESSIONS  -0.13289    0.83791  -0.159  0.87399    
## GENDERFEMALE:RANKNRPEASANTS/LABOURERS -1.81181    0.59365  -3.052  0.00227 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation matrix not shown by default, as p = 18 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it
# NOTE(review): m3 and m1 have the same number of parameters (npar = 22), so
# the test below reports Df = 0 and the chi-square p-value is not a valid
# nested likelihood-ratio test — these models are not nested in the usual
# sense and are better compared on AIC/BIC (which also favour m1).
anova(m3,m1)
## Data: cleaned_df
## Models:
## m3: response ~ Tensec * year_scaled + GENDER * RANKNR + ORIGINREGION + 
## m3:     AGE_scaled + (AGE_scaled | NAME) + (1 | NAME)
## m1: response ~ Tensec * year_scaled + GENDER * RANKNR + ORIGINREGION + 
## m1:     AGE_scaled + (AGE_scaled | NAME) + (1 | NAME:LETTER)
##    npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)    
## m3   22 5853.3 6010.7 -2904.7   5809.3                         
## m1   22 5838.1 5995.5 -2897.0   5794.1 15.238  0  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

More complex model is better. Inclusion of full random effects structure is justified. We do not simplify the random effects any further. Now move on to simplifying fixed effects, starting with interaction terms. Downward stepwise in order of least to most theoretically justified; GENDER*RANKNR first.

# m4: drop the GENDER:RANKNR interaction (both main effects retained);
# random effects identical to m1.
m4 <- glmer(response ~ Tensec * year_scaled + GENDER + RANKNR + ORIGINREGION + AGE_scaled + (AGE_scaled|NAME) + (1 |NAME:LETTER),
            family = binomial(link = "logit"), 
            data = cleaned_df ,
            control = glmerControl(optimizer = "bobyqa"))

summary(m4)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: response ~ Tensec * year_scaled + GENDER + RANKNR + ORIGINREGION +  
##     AGE_scaled + (AGE_scaled | NAME) + (1 | NAME:LETTER)
##    Data: cleaned_df
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##   5843.5   5986.6  -2901.7   5803.5     9448 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.2073 -0.3193 -0.2254 -0.1535  7.1977 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev. Corr
##  NAME:LETTER (Intercept) 0.2590   0.5090       
##  NAME        (Intercept) 0.7238   0.8508       
##              AGE_scaled  0.1801   0.4243   0.17
## Number of obs: 9468, groups:  NAME:LETTER, 1570; NAME, 113
## 
## Fixed effects:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -0.3919     0.6458  -0.607  0.54394    
## TensecPresent              -1.4384     0.1008 -14.268  < 2e-16 ***
## year_scaled                 1.1553     0.1792   6.447 1.14e-10 ***
## GENDERFEMALE                0.2356     0.2912   0.809  0.41831    
## RANKNROTHER PROFESSIONS     1.2378     0.4780   2.590  0.00961 ** 
## RANKNRPEASANTS/LABOURERS    1.2131     0.4164   2.913  0.00358 ** 
## ORIGINREGIONE               0.1231     0.6341   0.194  0.84602    
## ORIGINREGIONN               1.0220     1.5251   0.670  0.50281    
## ORIGINREGIONNE             -0.5819     0.6952  -0.837  0.40255    
## ORIGINREGIONNW             -0.5715     0.7411  -0.771  0.44065    
## ORIGINREGIONS              -1.8866     0.9543  -1.977  0.04805 *  
## ORIGINREGIONUNKNOWN        -1.1350     0.7205  -1.575  0.11520    
## ORIGINREGIONW              -0.8984     0.7499  -1.198  0.23090    
## ORIGINREGIONWf.            -1.2697     1.0639  -1.193  0.23270    
## AGE_scaled                 -0.2454     0.1055  -2.325  0.02008 *  
## TensecPresent:year_scaled  -0.9481     0.1118  -8.479  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation matrix not shown by default, as p = 16 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it
# Likelihood-ratio test of m4 against m1; 2 df for the two GENDER:RANKNR
# interaction coefficients.
anova(m4,m1)
## Data: cleaned_df
## Models:
## m4: response ~ Tensec * year_scaled + GENDER + RANKNR + ORIGINREGION + 
## m4:     AGE_scaled + (AGE_scaled | NAME) + (1 | NAME:LETTER)
## m1: response ~ Tensec * year_scaled + GENDER * RANKNR + ORIGINREGION + 
## m1:     AGE_scaled + (AGE_scaled | NAME) + (1 | NAME:LETTER)
##    npar    AIC    BIC  logLik deviance Chisq Df Pr(>Chisq)   
## m4   20 5843.5 5986.6 -2901.7   5803.5                       
## m1   22 5838.1 5995.5 -2897.0   5794.1 9.387  2   0.009155 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

More complex model is justified. What about the Tensec*year_scaled interaction?

# m5: additionally drop the Tensec:year_scaled interaction — all fixed
# effects now enter additively; random effects identical to m1.
m5 <- glmer(response ~ Tensec + year_scaled + GENDER + RANKNR + ORIGINREGION + AGE_scaled + (AGE_scaled|NAME) +  (1 |NAME:LETTER),
            family = binomial(link = "logit"), 
            data = cleaned_df ,
            control = glmerControl(optimizer = "bobyqa"))

summary(m5)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: response ~ Tensec + year_scaled + GENDER + RANKNR + ORIGINREGION +  
##     AGE_scaled + (AGE_scaled | NAME) + (1 | NAME:LETTER)
##    Data: cleaned_df
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##   5931.2   6067.2  -2946.6   5893.2     9449 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.0597 -0.3561 -0.2102 -0.1306  8.5967 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev. Corr
##  NAME:LETTER (Intercept) 0.2237   0.4730       
##  NAME        (Intercept) 0.7471   0.8643       
##              AGE_scaled  0.1780   0.4219   0.09
## Number of obs: 9468, groups:  NAME:LETTER, 1570; NAME, 113
## 
## Fixed effects:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -0.45928    0.65134  -0.705  0.48073    
## TensecPresent            -1.51869    0.09138 -16.619  < 2e-16 ***
## year_scaled               0.35101    0.14935   2.350  0.01876 *  
## GENDERFEMALE              0.23218    0.29538   0.786  0.43185    
## RANKNROTHER PROFESSIONS   1.25422    0.48132   2.606  0.00917 ** 
## RANKNRPEASANTS/LABOURERS  1.22444    0.42357   2.891  0.00384 ** 
## ORIGINREGIONE             0.19480    0.64208   0.303  0.76160    
## ORIGINREGIONN             1.06231    1.48677   0.715  0.47491    
## ORIGINREGIONNE           -0.53456    0.70273  -0.761  0.44684    
## ORIGINREGIONNW           -0.53811    0.74849  -0.719  0.47219    
## ORIGINREGIONS            -1.82121    0.96946  -1.879  0.06030 .  
## ORIGINREGIONUNKNOWN      -1.04218    0.72727  -1.433  0.15185    
## ORIGINREGIONW            -0.84436    0.75340  -1.121  0.26240    
## ORIGINREGIONWf.          -1.24757    1.10548  -1.129  0.25909    
## AGE_scaled               -0.23909    0.10547  -2.267  0.02340 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation matrix not shown by default, as p = 15 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it
# Likelihood-ratio test of m5 against m1; 3 df (1 for Tensec:year_scaled,
# 2 for GENDER:RANKNR).
anova(m5,m1)
## Data: cleaned_df
## Models:
## m5: response ~ Tensec + year_scaled + GENDER + RANKNR + ORIGINREGION + 
## m5:     AGE_scaled + (AGE_scaled | NAME) + (1 | NAME:LETTER)
## m1: response ~ Tensec * year_scaled + GENDER * RANKNR + ORIGINREGION + 
## m1:     AGE_scaled + (AGE_scaled | NAME) + (1 | NAME:LETTER)
##    npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)    
## m5   19 5931.2 6067.2 -2946.6   5893.2                         
## m1   22 5838.1 5995.5 -2897.0   5794.1 99.162  3  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

More complex model is better. What about the inclusion of ORIGINREGION and age? We start by removing region.

# m6: drop ORIGINREGION from m1; both interactions and the full random
# effects structure are retained. (This becomes the final model.)
m6 <- glmer(response ~ Tensec * year_scaled + GENDER * RANKNR + AGE_scaled + (AGE_scaled|NAME) + (1 |NAME:LETTER),
            family = binomial(link = "logit"), 
            data = cleaned_df ,
            control = glmerControl(optimizer = "bobyqa"))

summary(m6)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: response ~ Tensec * year_scaled + GENDER * RANKNR + AGE_scaled +  
##     (AGE_scaled | NAME) + (1 | NAME:LETTER)
##    Data: cleaned_df
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##   5835.9   5936.1  -2904.0   5807.9     9454 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.2058 -0.3206 -0.2252 -0.1562 10.7230 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev. Corr 
##  NAME:LETTER (Intercept) 0.2694   0.5190        
##  NAME        (Intercept) 0.5757   0.7588        
##              AGE_scaled  0.1712   0.4137   -0.18
## Number of obs: 9468, groups:  NAME:LETTER, 1570; NAME, 113
## 
## Fixed effects:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                           -1.64263    0.36457  -4.506 6.62e-06 ***
## TensecPresent                         -1.44433    0.10066 -14.348  < 2e-16 ***
## year_scaled                            1.25435    0.16868   7.436 1.04e-13 ***
## GENDERFEMALE                           1.18875    0.41378   2.873 0.004067 ** 
## RANKNROTHER PROFESSIONS                1.51886    0.54395   2.792 0.005234 ** 
## RANKNRPEASANTS/LABOURERS               1.70482    0.47316   3.603 0.000315 ***
## AGE_scaled                            -0.28285    0.09625  -2.939 0.003295 ** 
## TensecPresent:year_scaled             -0.94539    0.11168  -8.465  < 2e-16 ***
## GENDERFEMALE:RANKNROTHER PROFESSIONS  -0.28757    0.83031  -0.346 0.729084    
## GENDERFEMALE:RANKNRPEASANTS/LABOURERS -1.70097    0.62089  -2.740 0.006152 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##               (Intr) TnscPr yr_scl GENDERFEMALE RANKNP RANKNR AGE_sc TnsP:_
## TensecPrsnt   -0.198                                                       
## year_scaled    0.180 -0.050                                                
## GENDERFEMALE  -0.829 -0.023 -0.165                                         
## RANKNROTHEP   -0.656 -0.030 -0.387  0.584                                  
## RANKNRPEASA   -0.839 -0.032 -0.395  0.750        0.691                     
## AGE_scaled    -0.020  0.016 -0.262  0.020        0.192  0.172              
## TnscPrsnt:_   -0.028  0.041 -0.595  0.002        0.002  0.006  0.008       
## GENDERFEMAP    0.397  0.014  0.233 -0.491       -0.625 -0.416 -0.150  0.001
## GENDERFEMALE:  0.614  0.045  0.192 -0.736       -0.463 -0.697 -0.119  0.003
##               GENDEP
## TensecPrsnt         
## year_scaled         
## GENDERFEMALE        
## RANKNROTHEP         
## RANKNRPEASA         
## AGE_scaled          
## TnscPrsnt:_         
## GENDERFEMAP         
## GENDERFEMALE:  0.374
# Likelihood-ratio test of m6 (no region) against m1; 8 df for the eight
# ORIGINREGION contrasts.
anova(m6, m1)
## Data: cleaned_df
## Models:
## m6: response ~ Tensec * year_scaled + GENDER * RANKNR + AGE_scaled + 
## m6:     (AGE_scaled | NAME) + (1 | NAME:LETTER)
## m1: response ~ Tensec * year_scaled + GENDER * RANKNR + ORIGINREGION + 
## m1:     AGE_scaled + (AGE_scaled | NAME) + (1 | NAME:LETTER)
##    npar    AIC    BIC logLik deviance  Chisq Df Pr(>Chisq)  
## m6   14 5835.9 5936.1  -2904   5807.9                       
## m1   22 5838.1 5995.5  -2897   5794.1 13.851  8    0.08573 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

m1 is not significantly better than m6; region is removed from the model. What about age more generally? The next model (m7) removes AGE_scaled both as a fixed effect and as a by-author random term — hence the four degrees of freedom in the comparison below.

# m7: remove age from m1.
# NOTE(review): this drops AGE_scaled as a fixed effect AND the entire
# (AGE_scaled | NAME) random term (by-author intercept, slope, correlation),
# which is why the anova below reports Df = 4 — it is not a test of the
# fixed effect alone, contrary to the surrounding narrative.
m7 <- glmer(response ~ Tensec * year_scaled + GENDER * RANKNR + ORIGINREGION + (1 |NAME:LETTER),
            family = binomial(link = "logit"), 
            data = cleaned_df ,
            control = glmerControl(optimizer = "bobyqa"))

summary(m7)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: response ~ Tensec * year_scaled + GENDER * RANKNR + ORIGINREGION +  
##     (1 | NAME:LETTER)
##    Data: cleaned_df
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##   6027.0   6155.8  -2995.5   5991.0     9450 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.8070 -0.3147 -0.2068 -0.1597  6.5452 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev.
##  NAME:LETTER (Intercept) 0.9177   0.958   
## Number of obs: 9468, groups:  NAME:LETTER, 1570
## 
## Fixed effects:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                            -1.3394     0.3318  -4.037 5.42e-05 ***
## TensecPresent                          -1.4633     0.1055 -13.873  < 2e-16 ***
## year_scaled                             1.1857     0.1264   9.382  < 2e-16 ***
## GENDERFEMALE                            1.0235     0.2049   4.995 5.89e-07 ***
## RANKNROTHER PROFESSIONS                 2.1390     0.3119   6.859 6.95e-12 ***
## RANKNRPEASANTS/LABOURERS                1.8582     0.2397   7.751 9.13e-15 ***
## ORIGINREGIONE                           0.5649     0.2868   1.969   0.0489 *  
## ORIGINREGIONN                           2.4042     1.2517   1.921   0.0548 .  
## ORIGINREGIONNE                         -0.3412     0.2826  -1.207   0.2273    
## ORIGINREGIONNW                         -0.1805     0.3679  -0.491   0.6236    
## ORIGINREGIONS                          -0.5318     0.3316  -1.604   0.1087    
## ORIGINREGIONUNKNOWN                    -0.3983     0.3242  -1.229   0.2193    
## ORIGINREGIONW                          -1.2168     0.2939  -4.140 3.47e-05 ***
## ORIGINREGIONWf.                        -0.2652     0.3494  -0.759   0.4479    
## TensecPresent:year_scaled              -1.0699     0.1171  -9.139  < 2e-16 ***
## GENDERFEMALE:RANKNROTHER PROFESSIONS   -0.6836     0.5510  -1.241   0.2147    
## GENDERFEMALE:RANKNRPEASANTS/LABOURERS  -2.3824     0.3845  -6.197 5.77e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation matrix not shown by default, as p = 17 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it
# Likelihood-ratio test of m7 against m1; 4 df (fixed AGE_scaled plus the
# three by-author random-effect parameters).
anova(m7,m1)
## Data: cleaned_df
## Models:
## m7: response ~ Tensec * year_scaled + GENDER * RANKNR + ORIGINREGION + 
## m7:     (1 | NAME:LETTER)
## m1: response ~ Tensec * year_scaled + GENDER * RANKNR + ORIGINREGION + 
## m1:     AGE_scaled + (AGE_scaled | NAME) + (1 | NAME:LETTER)
##    npar    AIC    BIC  logLik deviance Chisq Df Pr(>Chisq)    
## m7   18 6027.0 6155.8 -2995.5   5991.0                        
## m1   22 5838.1 5995.5 -2897.0   5794.1 196.9  4  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

m1 is better. Next, with ORIGINREGION removed, we test whether the age terms are justified by comparing the model below (without age) against m6 (with age included).

# m8: drop both ORIGINREGION and all age terms; only a per-letter random
# intercept remains.
m8 <- glmer(response ~ Tensec * year_scaled + GENDER * RANKNR + (1 |NAME:LETTER),
            family = binomial(link = "logit"), 
            data = cleaned_df ,
            control = glmerControl(optimizer = "bobyqa"))
# LRT: m8 (no age) vs m6 (age fixed effect + by-author random slope).
anova(m8,m6)
## Data: cleaned_df
## Models:
## m8: response ~ Tensec * year_scaled + GENDER * RANKNR + (1 | NAME:LETTER)
## m6: response ~ Tensec * year_scaled + GENDER * RANKNR + AGE_scaled + 
## m6:     (AGE_scaled | NAME) + (1 | NAME:LETTER)
##    npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)    
## m8   10 6113.9 6185.4 -3046.9   6093.9                         
## m6   14 5835.9 5936.1 -2904.0   5807.9 285.94  4  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

m6 (more complex) is significantly better even with the additional parameters; that is the final model.

Model diagnostics

# DHARMa: simulation-based scaled-residual diagnostics for the final model m6.
library(DHARMa)
## Warning: package 'DHARMa' was built under R version 3.6.2
## This is DHARMa 0.3.2.0. For overview type '?DHARMa'. For recent changes, type news(package = 'DHARMa') Note: Syntax of plotResiduals has changed in 0.3.0, see ?plotResiduals for details
# Simulate quantile residuals; plot() shows QQ and residual-vs-predicted panels.
diagnostics_output <- simulateResiduals(fittedModel = m6)
plot(diagnostics_output)

Residuals are unconcerning, qqplot looks good and residual-predicted plot has no discernible pattern. As to the significant KS test; given the large sample size this is not surprising (higher sample size results in higher sensitivity of such tests).

# Exact binomial test for observations outside the simulated envelope.
testOutliers(diagnostics_output)

## 
##  DHARMa outlier test based on exact binomial test
## 
## data:  diagnostics_output
## outliers at both margin(s) = 0, simulations = 9468, p-value < 2.2e-16
## alternative hypothesis: true probability of success is not equal to 0.007968127
## 95 percent confidence interval:
##  0.0000000000 0.0003895396
## sample estimates:
## frequency of outliers (expected: 0.00796812749003984 ) 
##                                                      0
# One-sided test for overdispersion only (alternative = "greater").
testDispersion(diagnostics_output, alternative="greater")

## 
##  DHARMa nonparametric dispersion test via sd of residuals fitted vs.
##  simulated
## 
## data:  simulationOutput
## ratioObsSim = 0.91705, p-value = 0.924
## alternative hypothesis: greater

Dispersion not an issue. Model performs ‘better than expected’ wrt outliers, i.e. too few: https://github.com/florianhartig/DHARMa/issues/197. Not a problem with specification. We can also test for zero inflation.

# Compare observed zero count against zeros simulated under the fitted model.
testZeroInflation(diagnostics_output)

## 
##  DHARMa zero-inflation test via comparison to expected zeros with
##  simulation under H0 = fitted model
## 
## data:  simulationOutput
## ratioObsSim = 1.0329, p-value = 0.152
## alternative hypothesis: two.sided

Zero inflation also not an issue. Now we can look at performance measures, starting with pseudo R^2 using the MuMIn package, based on Nakagawa et al. (2017). We can look at the values for both the maximal model as well as the best model determined algorithmically.

# Nakagawa pseudo-R^2 via MuMIn: R2m = fixed effects only (marginal),
# R2c = fixed + random effects (conditional).
library(MuMIn)

r2nakagawa_m6 <- r.squaredGLMM(m6)
## Warning: 'r.squaredGLMM' now calculates a revised statistic. See the help page.
## Warning: the null model is correct only if all variables used by the original
## model remain unchanged.
r2nakagawa_m1 <- r.squaredGLMM(m1)
## Warning: the null model is correct only if all variables used by the original
## model remain unchanged.
r2nakagawa_m1
##                   R2m       R2c
## theoretical 0.3220195 0.4855288
## delta       0.2543800 0.3835446
r2nakagawa_m6
##                   R2m       R2c
## theoretical 0.2396493 0.4186807
## delta       0.1842819 0.3219508

Not surprisingly, the maximal model explains more of the variance in the fixed effects. Overall, these values indicate reasonable fit. We can also examine the C-index and Dxy measures.

library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.6.2
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.6.2
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.6.2
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
# Discrimination of m6: concordance (C-index) and Somers' Dxy computed from
# the fitted response-scale probabilities.
pred_probs <- predict(m6, type = "response")
C_index <- rcorr.cens(pred_probs, cleaned_df$response)
C_index
##        C Index            Dxy           S.D.              n        missing 
##   8.600587e-01   7.201173e-01   1.147977e-02   9.468000e+03   0.000000e+00 
##     uncensored Relevant Pairs     Concordant      Uncertain 
##   9.468000e+03   2.101651e+07   1.807543e+07   0.000000e+00

These values indicate reasonable discriminative ability.

Plots

We can now plot the results. Beginning with an interaction plot for GENDER*RANK stratified by Tense. year_scaled [all] allows for smoothing. The correspondence between scaled year and calendar year is as follows: -2 is 1785, -1.5 is 1800, -0.7 is 1825, 0 is 1850, and 1.75 is 1900.

# Predicted probabilities over year for each GENDER x RANKNR x Tensec
# combination; "[all]" keeps every observed year value for smooth curves.
library(ggeffects)
## Warning: package 'ggeffects' was built under R version 3.6.2
effects <- ggpredict(m6 , terms = c("year_scaled [all]", "GENDER", "RANKNR", "Tensec"))
plot(effects) + theme_minimal()

## NULL

Next we plot the two way interactions, i.e GENDER * RANK and YEAR * TENSE.

# Two-way interaction: GENDER x RANKNR, averaged over the other predictors.
gender_rank <- ggpredict(m6 , terms = c("GENDER", "RANKNR")) 
plot(gender_rank) + theme_minimal()

# Same interaction, panelled by tense.
gender_rank_bytense <- ggpredict(m6 , terms = c("GENDER", "RANKNR", "Tensec")) 
plot(gender_rank_bytense) + theme_minimal()

# Two-way interaction: year x tense.
year_tense <- ggpredict(m6 , terms = c( "year_scaled [all]", "Tensec")) 
plot(year_tense) + theme_minimal()

A standard effect plot is useful as well; for this I use the package sjPlot.

# Forest plot of the fixed effects, sorted by estimate, with values printed
# next to each point.
library(sjPlot)
## Warning: package 'sjPlot' was built under R version 3.6.2
model_plot <- plot_model(m6, sort.est = TRUE, show.values = TRUE, value.offset = .3)
model_plot + theme_minimal()

Finally we can look at partial dependence plots. These show the effect of a single predictor accounting for ‘average’ effects of others. Useful for seeing overall trends. We can look at: year (scaled/centred), GENDER, RANKNR, TENSE and AGE (scaled/centred) (in that order).

# Marginal (partial-dependence-style) effect of each single predictor in turn:
# year, gender, rank, tense, age.
pdp_yearcs <- ggpredict(m6, terms = "year_scaled [all]")
plot(pdp_yearcs) + theme_minimal()

pdp_gender <- ggpredict(m6, terms = "GENDER")
plot(pdp_gender) + theme_minimal()

pdp_rank <- ggpredict(m6, terms = "RANKNR")
plot(pdp_rank) + theme_minimal()

pdp_tense <- ggpredict(m6, terms = "Tensec")
plot(pdp_tense) + theme_minimal()

pdp_age <- ggpredict(m6, terms = "AGE_scaled [all]")
plot(pdp_age) + theme_minimal()

Model with ‘present’ only

We can fit a model with present tense only for comparison. Fit most complex model first.

# Subset to present-tense observations only.
cleaned_df_pres <- cleaned_df %>%
  filter(Tensec == "Present")

# Maximal present-only model: no Tensec terms (constant within this subset);
# otherwise the same fixed- and random-effect structure as m1.
m1_pres <- glmer(response ~ year_scaled + GENDER * RANKNR + ORIGINREGION + AGE_scaled + (AGE_scaled|NAME) + (1|NAME:LETTER),
            family = binomial(link = "logit"), 
            data = cleaned_df_pres,
            control = glmerControl(optimizer = "bobyqa"))

summary(m1_pres)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## response ~ year_scaled + GENDER * RANKNR + ORIGINREGION + AGE_scaled +  
##     (AGE_scaled | NAME) + (1 | NAME:LETTER)
##    Data: cleaned_df_pres
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##   4896.6   5036.5  -2428.3   4856.6     8042 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.2898 -0.3029 -0.2275 -0.1581  8.4991 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev. Corr
##  NAME:LETTER (Intercept) 0.3108   0.5575       
##  NAME        (Intercept) 0.5932   0.7702       
##              AGE_scaled  0.1583   0.3979   0.31
## Number of obs: 8062, groups:  NAME:LETTER, 1512; NAME, 112
## 
## Fixed effects:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                           -2.03481    0.61766  -3.294 0.000986 ***
## year_scaled                            0.24746    0.14682   1.685 0.091908 .  
## GENDERFEMALE                           0.89849    0.43506   2.065 0.038904 *  
## RANKNROTHER PROFESSIONS                1.57825    0.59282   2.662 0.007761 ** 
## RANKNRPEASANTS/LABOURERS               1.86954    0.50202   3.724 0.000196 ***
## ORIGINREGIONE                         -0.39338    0.55885  -0.704 0.481487    
## ORIGINREGIONN                          1.26700    1.53287   0.827 0.408491    
## ORIGINREGIONNE                        -1.03299    0.60022  -1.721 0.085244 .  
## ORIGINREGIONNW                        -1.09159    0.67739  -1.611 0.107081    
## ORIGINREGIONS                         -1.95698    0.81757  -2.394 0.016681 *  
## ORIGINREGIONUNKNOWN                   -1.38060    0.61643  -2.240 0.025111 *  
## ORIGINREGIONW                         -1.31792    0.63976  -2.060 0.039395 *  
## ORIGINREGIONWf.                       -1.21792    0.96641  -1.260 0.207577    
## AGE_scaled                            -0.22613    0.10273  -2.201 0.027727 *  
## GENDERFEMALE:RANKNROTHER PROFESSIONS   0.06773    0.86129   0.079 0.937322    
## GENDERFEMALE:RANKNRPEASANTS/LABOURERS -1.64463    0.63518  -2.589 0.009619 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation matrix not shown by default, as p = 16 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it

Proceeding with the algorithmic procedure, we start by removing the slope for age.

# m2_pres: drop the by-author random slope for age (AGE_scaled stays as a
# fixed effect; only the per-letter random intercept remains).
m2_pres <- glmer(response ~ year_scaled + GENDER * RANKNR + ORIGINREGION + AGE_scaled + (1 |NAME:LETTER),
            family = binomial(link = "logit"), 
            data = cleaned_df_pres,
            control = glmerControl(optimizer = "bobyqa"))

summary(m2_pres)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## response ~ year_scaled + GENDER * RANKNR + ORIGINREGION + AGE_scaled +  
##     (1 | NAME:LETTER)
##    Data: cleaned_df_pres
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##   5026.2   5145.2  -2496.1   4992.2     8045 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.3468 -0.3068 -0.2011 -0.1569  6.6451 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev.
##  NAME:LETTER (Intercept) 0.953    0.9762  
## Number of obs: 8062, groups:  NAME:LETTER, 1512
## 
## Fixed effects:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                           -2.40517    0.34918  -6.888 5.65e-12 ***
## year_scaled                            0.24506    0.07730   3.170 0.001524 ** 
## GENDERFEMALE                           0.78022    0.23249   3.356 0.000791 ***
## RANKNROTHER PROFESSIONS                1.71236    0.33968   5.041 4.63e-07 ***
## RANKNRPEASANTS/LABOURERS               1.62057    0.26423   6.133 8.61e-10 ***
## ORIGINREGIONE                          0.08101    0.30088   0.269 0.787747    
## ORIGINREGIONN                          1.97208    1.30307   1.513 0.130176    
## ORIGINREGIONNE                        -0.74509    0.29736  -2.506 0.012222 *  
## ORIGINREGIONNW                        -0.64862    0.39121  -1.658 0.097323 .  
## ORIGINREGIONS                         -0.82685    0.35711  -2.315 0.020593 *  
## ORIGINREGIONUNKNOWN                   -0.78638    0.34692  -2.267 0.023406 *  
## ORIGINREGIONW                         -1.53189    0.31202  -4.910 9.13e-07 ***
## ORIGINREGIONWf.                       -0.26945    0.37609  -0.716 0.473719    
## AGE_scaled                            -0.29046    0.05565  -5.220 1.79e-07 ***
## GENDERFEMALE:RANKNROTHER PROFESSIONS   0.01450    0.59244   0.024 0.980469    
## GENDERFEMALE:RANKNRPEASANTS/LABOURERS -1.88774    0.42958  -4.394 1.11e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation matrix not shown by default, as p = 16 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it
# LRT: is the by-author random age slope justified? (m2_pres nested in m1_pres)
anova(m2_pres, m1_pres)
## Data: cleaned_df_pres
## Models:
## m2_pres: response ~ year_scaled + GENDER * RANKNR + ORIGINREGION + AGE_scaled + 
## m2_pres:     (1 | NAME:LETTER)
## m1_pres: response ~ year_scaled + GENDER * RANKNR + ORIGINREGION + AGE_scaled + 
## m1_pres:     (AGE_scaled | NAME) + (1 | NAME:LETTER)
##         npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)    
## m2_pres   17 5026.2 5145.2 -2496.1   4992.2                         
## m1_pres   20 4896.6 5036.5 -2428.3   4856.6 135.68  3  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

More complex model justified. What about a model with only a random effect for author and not for letter?

# m3_pres: replace the per-letter intercept (1|NAME:LETTER) with a second
# author-level intercept, i.e. random effects at the author level only.
m3_pres <- glmer(response ~ year_scaled + GENDER * RANKNR + ORIGINREGION + AGE_scaled + (AGE_scaled|NAME) + (1 |NAME),
            family = binomial(link = "logit"), 
            data = cleaned_df_pres,
            control = glmerControl(optimizer = "bobyqa"))

summary(m3_pres)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## response ~ year_scaled + GENDER * RANKNR + ORIGINREGION + AGE_scaled +  
##     (AGE_scaled | NAME) + (1 | NAME)
##    Data: cleaned_df_pres
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##   4913.0   5052.9  -2436.5   4873.0     8042 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.2684 -0.2943 -0.2292 -0.1533  9.8581 
## 
## Random effects:
##  Groups Name        Variance Std.Dev. Corr
##  NAME   (Intercept) 0.2164   0.4652       
##         AGE_scaled  0.1682   0.4101   0.57
##  NAME.1 (Intercept) 0.4096   0.6400       
## Number of obs: 8062, groups:  NAME, 112
## 
## Fixed effects:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                           -1.90932    0.60526  -3.155 0.001607 ** 
## year_scaled                            0.23653    0.14266   1.658 0.097309 .  
## GENDERFEMALE                           0.86740    0.43206   2.008 0.044689 *  
## RANKNROTHER PROFESSIONS                1.55255    0.57541   2.698 0.006973 ** 
## RANKNRPEASANTS/LABOURERS               1.83654    0.48921   3.754 0.000174 ***
## ORIGINREGIONE                         -0.40549    0.55232  -0.734 0.462851    
## ORIGINREGIONN                          1.26966    1.46004   0.870 0.384514    
## ORIGINREGIONNE                        -1.07011    0.58026  -1.844 0.065156 .  
## ORIGINREGIONNW                        -1.13889    0.65652  -1.735 0.082785 .  
## ORIGINREGIONS                         -2.04107    0.79571  -2.565 0.010315 *  
## ORIGINREGIONUNKNOWN                   -1.41467    0.59756  -2.367 0.017913 *  
## ORIGINREGIONW                         -1.32666    0.62831  -2.111 0.034731 *  
## ORIGINREGIONWf.                       -1.27931    0.94669  -1.351 0.176580    
## AGE_scaled                            -0.21505    0.09982  -2.154 0.031206 *  
## GENDERFEMALE:RANKNROTHER PROFESSIONS   0.02975    0.84250   0.035 0.971827    
## GENDERFEMALE:RANKNRPEASANTS/LABOURERS -1.60191    0.62029  -2.583 0.009808 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation matrix not shown by default, as p = 16 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it
# Non-nested comparison: both models have npar = 20, so the chi-square has
# Df = 0 and its p-value is not interpretable; judge by AIC/BIC/deviance.
anova(m3_pres, m1_pres)
## Data: cleaned_df_pres
## Models:
## m3_pres: response ~ year_scaled + GENDER * RANKNR + ORIGINREGION + AGE_scaled + 
## m3_pres:     (AGE_scaled | NAME) + (1 | NAME)
## m1_pres: response ~ year_scaled + GENDER * RANKNR + ORIGINREGION + AGE_scaled + 
## m1_pres:     (AGE_scaled | NAME) + (1 | NAME:LETTER)
##         npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)    
## m3_pres   20 4913.0 5052.9 -2436.5   4873.0                         
## m1_pres   20 4896.6 5036.5 -2428.3   4856.6 16.402  0  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

m1_pres fits better by AIC and deviance (note that both models have 20 parameters, so the likelihood-ratio test has 0 degrees of freedom and its p-value is not interpretable; the comparison rests on AIC/BIC). Inclusion of the full random effects structure is justified. We do not simplify the random effects any further. Now move on to simplifying fixed effects, starting with interaction terms. Downward stepwise in order of least to most theoretically justified; GENDER*RANK first.

# m4_pres: drop the GENDER:RANKNR interaction (main effects only).
m4_pres <- glmer(response ~ year_scaled + GENDER + RANKNR + ORIGINREGION + AGE_scaled + (AGE_scaled|NAME) + (1 |NAME:LETTER),
            family = binomial(link = "logit"), 
            data = cleaned_df_pres,
            control = glmerControl(optimizer = "bobyqa"))

summary(m4_pres)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## response ~ year_scaled + GENDER + RANKNR + ORIGINREGION + AGE_scaled +  
##     (AGE_scaled | NAME) + (1 | NAME:LETTER)
##    Data: cleaned_df_pres
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##   4900.0   5025.9  -2432.0   4864.0     8044 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.2917 -0.3011 -0.2290 -0.1546  7.9165 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev. Corr
##  NAME:LETTER (Intercept) 0.3108   0.5575       
##  NAME        (Intercept) 0.6756   0.8219       
##              AGE_scaled  0.1754   0.4189   0.20
## Number of obs: 8062, groups:  NAME:LETTER, 1512; NAME, 112
## 
## Fixed effects:
##                          Estimate Std. Error z value Pr(>|z|)   
## (Intercept)               -1.6936     0.6365  -2.661  0.00779 **
## year_scaled                0.2572     0.1520   1.692  0.09063 . 
## GENDERFEMALE               0.2763     0.3002   0.920  0.35736   
## RANKNROTHER PROFESSIONS    1.2759     0.4892   2.608  0.00910 **
## RANKNRPEASANTS/LABOURERS   1.2637     0.4363   2.896  0.00378 **
## ORIGINREGIONE             -0.1587     0.6195  -0.256  0.79786   
## ORIGINREGIONN              0.6369     1.5416   0.413  0.67948   
## ORIGINREGIONNE            -0.8786     0.6902  -1.273  0.20306   
## ORIGINREGIONNW            -0.9846     0.7659  -1.285  0.19863   
## ORIGINREGIONS             -2.0154     0.9686  -2.081  0.03746 * 
## ORIGINREGIONUNKNOWN       -1.3774     0.7269  -1.895  0.05811 . 
## ORIGINREGIONW             -1.1740     0.7401  -1.586  0.11270   
## ORIGINREGIONWf.           -1.4491     1.0762  -1.346  0.17817   
## AGE_scaled                -0.2300     0.1077  -2.135  0.03280 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation matrix not shown by default, as p = 14 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it
# LRT: is the GENDER:RANKNR interaction justified?
anova(m4_pres, m1_pres)
## Data: cleaned_df_pres
## Models:
## m4_pres: response ~ year_scaled + GENDER + RANKNR + ORIGINREGION + AGE_scaled + 
## m4_pres:     (AGE_scaled | NAME) + (1 | NAME:LETTER)
## m1_pres: response ~ year_scaled + GENDER * RANKNR + ORIGINREGION + AGE_scaled + 
## m1_pres:     (AGE_scaled | NAME) + (1 | NAME:LETTER)
##         npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)  
## m4_pres   18 4900.0 5025.9 -2432.0   4864.0                       
## m1_pres   20 4896.6 5036.5 -2428.3   4856.6 7.3999  2    0.02472 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

More complex model is justified. We can now drop originregion.

# m5_pres: drop ORIGINREGION; keep GENDER*RANKNR, age terms and full random
# structure. (This becomes the final present-only model.)
m5_pres <- glmer(response ~ year_scaled + GENDER * RANKNR + AGE_scaled + (AGE_scaled|NAME) + (1 |NAME:LETTER),
            family = binomial(link = "logit"), 
            data = cleaned_df_pres,
            control = glmerControl(optimizer = "bobyqa"))

summary(m5_pres)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: response ~ year_scaled + GENDER * RANKNR + AGE_scaled + (AGE_scaled |  
##     NAME) + (1 | NAME:LETTER)
##    Data: cleaned_df_pres
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##   4892.5   4976.5  -2434.3   4868.5     8050 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.2580 -0.3029 -0.2290 -0.1577 10.2239 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev. Corr 
##  NAME:LETTER (Intercept) 0.3291   0.5737        
##  NAME        (Intercept) 0.5244   0.7241        
##              AGE_scaled  0.1751   0.4185   -0.28
## Number of obs: 8062, groups:  NAME:LETTER, 1512; NAME, 112
## 
## Fixed effects:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                           -3.01906    0.36604  -8.248  < 2e-16 ***
## year_scaled                            0.37545    0.13691   2.742  0.00610 ** 
## GENDERFEMALE                           1.06222    0.42337   2.509  0.01211 *  
## RANKNROTHER PROFESSIONS                1.31443    0.55535   2.367  0.01794 *  
## RANKNRPEASANTS/LABOURERS               1.48462    0.49432   3.003  0.00267 ** 
## AGE_scaled                            -0.27040    0.09869  -2.740  0.00615 ** 
## GENDERFEMALE:RANKNROTHER PROFESSIONS  -0.01307    0.83498  -0.016  0.98751    
## GENDERFEMALE:RANKNRPEASANTS/LABOURERS -1.39609    0.63517  -2.198  0.02795 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##               (Intr) yr_scl GENDERFEMALE RANKNP RANKNR AGE_sc GENDEP
## year_scaled    0.223                                                
## GENDERFEMALE  -0.854 -0.237                                         
## RANKNROTHEP   -0.686 -0.490  0.599                                  
## RANKNRPEASA   -0.869 -0.505  0.762        0.699                     
## AGE_scaled    -0.058 -0.314  0.046        0.210  0.188              
## GENDERFEMAP    0.420  0.306 -0.503       -0.636 -0.430 -0.160       
## GENDERFEMALE:  0.639  0.272 -0.735       -0.478 -0.702 -0.143  0.389
# LRT: is ORIGINREGION justified? (m5_pres nested in m1_pres)
anova(m5_pres, m1_pres)
## Data: cleaned_df_pres
## Models:
## m5_pres: response ~ year_scaled + GENDER * RANKNR + AGE_scaled + (AGE_scaled | 
## m5_pres:     NAME) + (1 | NAME:LETTER)
## m1_pres: response ~ year_scaled + GENDER * RANKNR + ORIGINREGION + AGE_scaled + 
## m1_pres:     (AGE_scaled | NAME) + (1 | NAME:LETTER)
##         npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)
## m5_pres   12 4892.5 4976.5 -2434.3   4868.5                     
## m1_pres   20 4896.6 5036.5 -2428.3   4856.6 11.973  8     0.1524

m5_pres is better; ORIGINREGION is not justified. What about age? We remove it entirely (both the fixed effect and the by-author random slope) while keeping ORIGINREGION, i.e. the model below.

# m7_pres: drop all age terms (AGE_scaled fixed effect AND the by-author
# random slope), keeping ORIGINREGION and the per-letter intercept.
m7_pres <- glmer(response ~ year_scaled + GENDER * RANKNR + ORIGINREGION + (1 |NAME:LETTER),
            family = binomial(link = "logit"), 
            data = cleaned_df_pres,
            control = glmerControl(optimizer = "bobyqa"))

summary(m7_pres)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: response ~ year_scaled + GENDER * RANKNR + ORIGINREGION + (1 |  
##     NAME:LETTER)
##    Data: cleaned_df_pres
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##   5051.6   5163.5  -2509.8   5019.6     8046 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.3587 -0.3051 -0.2019 -0.1564  7.1890 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev.
##  NAME:LETTER (Intercept) 1.041    1.02    
## Number of obs: 8062, groups:  NAME:LETTER, 1512
## 
## Fixed effects:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                           -2.56723    0.35245  -7.284 3.24e-13 ***
## year_scaled                            0.17751    0.07701   2.305   0.0212 *  
## GENDERFEMALE                           0.99509    0.22854   4.354 1.34e-05 ***
## RANKNROTHER PROFESSIONS                2.04853    0.33827   6.056 1.40e-09 ***
## RANKNRPEASANTS/LABOURERS               1.81379    0.26435   6.861 6.83e-12 ***
## ORIGINREGIONE                          0.20784    0.30583   0.680   0.4968    
## ORIGINREGIONN                          2.00536    1.33013   1.508   0.1316    
## ORIGINREGIONNE                        -0.65092    0.30227  -2.153   0.0313 *  
## ORIGINREGIONNW                        -0.51455    0.39691  -1.296   0.1948    
## ORIGINREGIONS                         -0.73760    0.36067  -2.045   0.0408 *  
## ORIGINREGIONUNKNOWN                   -0.66788    0.35408  -1.886   0.0593 .  
## ORIGINREGIONW                         -1.44070    0.31597  -4.560 5.12e-06 ***
## ORIGINREGIONWf.                       -0.44106    0.38102  -1.158   0.2470    
## GENDERFEMALE:RANKNROTHER PROFESSIONS  -0.48630    0.59273  -0.820   0.4120    
## GENDERFEMALE:RANKNRPEASANTS/LABOURERS -2.34983    0.42821  -5.488 4.08e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation matrix not shown by default, as p = 15 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it
# LRT: are the age terms justified? (m7_pres nested in m1_pres)
anova(m7_pres,m1_pres)
## Data: cleaned_df_pres
## Models:
## m7_pres: response ~ year_scaled + GENDER * RANKNR + ORIGINREGION + (1 | 
## m7_pres:     NAME:LETTER)
## m1_pres: response ~ year_scaled + GENDER * RANKNR + ORIGINREGION + AGE_scaled + 
## m1_pres:     (AGE_scaled | NAME) + (1 | NAME:LETTER)
##         npar    AIC    BIC  logLik deviance Chisq Df Pr(>Chisq)    
## m7_pres   16 5051.6 5163.5 -2509.8   5019.6                        
## m1_pres   20 4896.6 5036.5 -2428.3   4856.6   163  4  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

m1_pres is better; age is justified. Next, with ORIGINREGION removed, we compare a model without the age terms (below) to one with them, i.e. m5_pres.

# m8_pres: drop both ORIGINREGION and all age terms.
m8_pres <- glmer(response ~ year_scaled + GENDER * RANKNR + (1 |NAME:LETTER),
            family = binomial(link = "logit"), 
            data = cleaned_df_pres,
            control = glmerControl(optimizer = "bobyqa"))
# LRT vs m5_pres, which retains the age terms.
anova(m8_pres,m5_pres)
## Data: cleaned_df_pres
## Models:
## m8_pres: response ~ year_scaled + GENDER * RANKNR + (1 | NAME:LETTER)
## m5_pres: response ~ year_scaled + GENDER * RANKNR + AGE_scaled + (AGE_scaled | 
## m5_pres:     NAME) + (1 | NAME:LETTER)
##         npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)    
## m8_pres    8 5114.9 5170.8 -2549.4   5098.9                         
## m5_pres   12 4892.5 4976.5 -2434.3   4868.5 230.33  4  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

m5_pres is better. Let us now move on to plotting. First the interaction effects.

Model evaluation (Present)

# DHARMa residual diagnostics for the final present-only model (m5_pres).
diagnostics_output_pres <- simulateResiduals(fittedModel = m5_pres)
plot(diagnostics_output_pres)

# Exact binomial outlier test (present-only model).
testOutliers(diagnostics_output_pres)

## 
##  DHARMa outlier test based on exact binomial test
## 
## data:  diagnostics_output_pres
## outliers at both margin(s) = 0, simulations = 8062, p-value < 2.2e-16
## alternative hypothesis: true probability of success is not equal to 0.007968127
## 95 percent confidence interval:
##  0.0000000000 0.0004574591
## sample estimates:
## frequency of outliers (expected: 0.00796812749003984 ) 
##                                                      0
# Two-sided dispersion test (default alternative).
testDispersion(diagnostics_output_pres)

## 
##  DHARMa nonparametric dispersion test via sd of residuals fitted vs.
##  simulated
## 
## data:  simulationOutput
## ratioObsSim = 0.93893, p-value = 0.296
## alternative hypothesis: two.sided
# Zero-inflation test (present-only model).
testZeroInflation(diagnostics_output_pres)

## 
##  DHARMa zero-inflation test via comparison to expected zeros with
##  simulation under H0 = fitted model
## 
## data:  simulationOutput
## ratioObsSim = 1.024, p-value = 0.256
## alternative hypothesis: two.sided

Nothing overly concerning on visual inspection of these plots. Deviations w.r.t KS and outliers are also not a concern given the output above.

Goodness of fit measures

# Nakagawa pseudo-R^2 for the present-only final model.
r2nakagawa_m5_pres <- r.squaredGLMM(m5_pres)
## Warning: the null model is correct only if all variables used by the original
## model remain unchanged.
r2nakagawa_m5_pres
##                   R2m       R2c
## theoretical 0.1789610 0.3740062
## delta       0.1225523 0.2561191
# Discrimination of the present-only model m5_pres.
pred_probs_pres <- predict(m5_pres, type = "response")
# FIX: this previously called rcorr.cens(pred_probs, cleaned_df$response),
# i.e. the predictions and responses of the FULL model/dataset, which is why
# the printed C-index below is identical to the earlier full-model value
# (note n = 9468 rather than the 8062 present-tense observations). Use the
# present-only objects instead.
C_index_pres <- rcorr.cens(pred_probs_pres, cleaned_df_pres$response)
C_index_pres
## NOTE(review): the output below is stale — it was produced by the buggy
## call on the full dataset. Re-knit to regenerate with the corrected call.
##        C Index            Dxy           S.D.              n        missing 
##   8.600587e-01   7.201173e-01   1.147977e-02   9.468000e+03   0.000000e+00 
##     uncensored Relevant Pairs     Concordant      Uncertain 
##   9.468000e+03   2.101651e+07   1.807543e+07   0.000000e+00

These indicate reasonable performance. Note, however, that the C-index above was computed from the full-model predictions (pred_probs on cleaned_df) rather than the present-only model's (pred_probs_pres on cleaned_df_pres) — hence the values are identical to the earlier full-model output and n = 9468. The call should be re-run with the present-only objects before drawing conclusions about whether a separate model is justified.

Plots (Part 2)

First, a standard effect plot

# Forest plot of the fixed effects for the present-only model.
model_plot_pres <- plot_model(m5_pres, sort.est = TRUE, show.values = TRUE, value.offset = .3)
model_plot_pres + theme_minimal()

Interaction plots.

# Predicted probabilities over year for each GENDER x RANKNR combination.
effects_pres <- ggpredict(m5_pres , terms = c("year_scaled [all]", "GENDER", "RANKNR"))
plot(effects_pres) + theme_minimal()

Partial dependence plots are also informative here, given that we are only looking at a single level for tense.

# Marginal effect of each predictor in the present-only model:
# year, gender, rank, age.
pdp_yearcs_pres <- ggpredict(m5_pres, terms = "year_scaled")
## Data were 'prettified'. Consider using `terms="year_scaled [all]"` to get smooth plots.
plot(pdp_yearcs_pres)

pdp_gender_pres <- ggpredict(m5_pres, terms = "GENDER")
plot(pdp_gender_pres)

pdp_rank_pres <- ggpredict(m5_pres, terms = "RANKNR")
plot(pdp_rank_pres)

pdp_age_pres <- ggpredict(m5_pres, terms = "AGE_scaled")
## Data were 'prettified'. Consider using `terms="AGE_scaled [all]"` to get smooth plots.
plot(pdp_age_pres)